*! version 5.0
* 13 August 2018
* NIDS

* THIS IS A FOOD AND NON-FOOD EXPENDITURE DO FILE: 8 OF 14

*=====================================================================================================================================
* GLOBALS FOR DATA FILES, DO FILES AND VERSION SUFFIXES

* DEFINED IN "W1 Food_NonFood Expenditure - Master  Food_NonFood Expenditure do file  (1 of 14).do"

*=====================================================================================================================================
* SETTING UP STATA TO RUN DO FILES

* Start from a clean session: drop any data and matrices held in memory and
* switch off output paging so the do file runs without pausing.
clear
cap clear matrix
set more off 

* Load tempdata6.dta from the folder named by the global $DataOUT (globals are
* defined in the master do file, 1 of 14). A forward slash is used as the path
* separator because Stata accepts it on Windows, macOS and Unix alike, whereas a
* backslash only works on Windows.
use "$DataOUT/tempdata6.dta", clear

**********************************************************************
***			Food2: Item Imputations
**********************************************************************
***			Small Observation Cases
**********************************************************************

* Create one working variable per non-food consumption item (1-52). At this
* stage each e#imputeS is simply a copy of the raw variable e2_2_#; missing
* values are filled in by the imputation steps further down.
foreach item of numlist 1/52 {
	quietly generate e`item'imputeS = e2_2_`item'
}

* Items with only a small number of observations (29, 34, 39, 40, 49, 21):
* missing values are imputed with the population-wide median. Consumption of
* these items is so rare and idiosyncratic that regressions would be spurious
* and cells too small to use cell medians.
*
* For each listed item:
*   e#median   = median of the raw variable e2_2_# over the whole dataset
*   e#imputeS  = e#median wherever e#imputeS is still missing and e2_1_# == 1
*
* The condition names e#imputeS in full. The variable e#impute is not created
* until the regression section, so the shorter spelling would only resolve via
* Stata's variable-name abbreviation and would fail under -set varabbrev off-
* or as soon as another variable starting with e#impute exists.
foreach item of numlist 29 34 39 40 49 21 {
	egen e`item'median = median(e2_2_`item')
	replace e`item'imputeS = e`item'median if e`item'imputeS==. & e2_1_`item'==1
}

*********************************************************************
***		Imputing Using Regression Model
**********************************************************************

* Regression-based imputation for every item (1-52).
*
* The same set of predictors is used for every item, so it is stored once in a
* local macro and reused inside the loop.
local regressors lgincome w1_h_dwlrms westerncape easterncape northerncape freestate kwazulunatal northwest gauteng mpumalanga urban ///
	hhsizer maxage maxed Asian White Coloured grants anychildren

foreach item of numlist 1/52 {
	* Start from e#imputeS (raw values, plus the medians already filled in for the small-observation items)
	quietly generate e`item'impute = e`item'imputeS

	* Impute the variable e2_2_#lg from the predictors, restricted to observations with e2_1_# == 1;
	* the result is stored in e#imputelg
	impute e2_2_`item'lg `regressors' if e2_1_`item'==1, gen(e`item'imputelg)

	* Fill values that are still missing with the exponentiated imputation
	* (NOTE(review): presumably e2_2_#lg is the log of e2_2_#, created in an earlier do file - confirm)
	quietly replace e`item'impute = exp(e`item'imputelg) if e`item'impute==.
}

**********************************************************************
***		Overview of Imputations
**********************************************************************

* Note: the same observations are imputed under both methods (regression and
* cell median), so the flags and counts below do not depend on which method is used.

* Flag, for each item, whether the value was imputed:
*   e#imputed = 1 when e#impute is non-missing but the raw e2_2_# is missing
*   e#imputed = 0 in every other case
foreach item of numlist 1/52 {
	generate e`item'imputed = 1 if e`item'impute!=. & e2_2_`item'==.
	quietly replace e`item'imputed = 0 if !(e`item'impute!=. & e2_2_`item'==.)
}

* Temporarily turn the 0 flags into missing so that count() below tallies only
* the observations flagged 1. The wildcard applies to every variable in memory
* whose name matches e*imputed.
quietly recode e*imputed (0=.)

* For each item (1-52):
*   impd#     = number of observations whose value was imputed
*   nsmall#   = number of observations with a non-missing raw value e2_2_#
*   imppercc# = impd# / nsmall#
foreach item of numlist 1/52 {
	egen impd`item' = count(e`item'imputed)
	quietly egen nsmall`item' = count(e2_2_`item')
	quietly generate imppercc`item' = impd`item' / nsmall`item'
}

* Put the flags back to 0/1 coding (missing becomes 0 again)
quietly recode e*imputed (.=0)

**********************************************************************
***		Imputing Using Cell Medians
**********************************************************************

* Cut-off response rate for PSUs and districts: a PSU or district median is
* only used for imputation when its response rate is above this threshold.
global rate2 "0.6"

* Cell-median imputation, step 1. For each item, median#imp starts as a copy of
* e#imputeS. Values still missing (with e2_1_# == 1) are filled first with the
* PSU median and then, if still missing, with the district median. A cell's
* median is used only when its count exceeds 1 and its response rate exceeds $rate2.
foreach item of numlist 1/52 {
	quietly generate median`item'imp = e`item'imputeS

	* PSU-level median
	quietly replace median`item'imp = npsumedian`item' ///
		if median`item'imp==. & e2_1_`item'==1 & psu`item'count>1 & psunrate`item'>${rate2}

	* District-level median for whatever the PSU step could not fill
	quietly replace median`item'imp = ndismedian`item' ///
		if median`item'imp==. & e2_1_`item'==1 & dis`item'count>1 & disnrate`item'>${rate2}
}

* Cell-median imputation, step 2. Compute the median of the raw e2_2_# within
* each province and use it for any value that is still missing after the PSU
* and district steps (only where e2_1_# == 1 and a provincial median exists).
foreach item of numlist 1/52 {
	quietly egen nprovmedian`item' = median(e2_2_`item'), by(province)
	quietly replace median`item'imp = nprovmedian`item' ///
		if e2_1_`item'==1 & median`item'imp==. & nprovmedian`item'!=.
}

**********************************************************************
***		Examination of Imputed Values
**********************************************************************

* Keep, for each item, only the values that WERE imputed (e#imputed == 1),
* separately for the two methods:
*   nfmedian#meds = cell-median imputation (median#imp)
*   nfimp#imps    = regression imputation  (e#impute)
* Observations that were not imputed are left missing in both variables.
foreach item of numlist 1/52 {
	quietly generate nfmedian`item'meds = median`item'imp if e`item'imputed==1
	quietly generate nfimp`item'imps = e`item'impute if e`item'imputed==1
}

* Summary statistics of the imputed values under both methods, listed without separator lines
summarize nfmedian*meds nfimp*imps, separator(0)

* Row-wise sum of all imputed values, one total per imputation method:
*   ntotalmeds = sum over items of nfmedian#meds (cell medians)
*   ntotalimps = sum over items of nfimp#imps    (regression)
egen ntotalmeds = rowtotal(nfmedian*meds)
egen ntotalimps = rowtotal(nfimp*imps)

* rowtotal() returns 0 when every component is missing; set those totals to
* missing so that rows with nothing imputed drop out of the summaries.
foreach tot of varlist ntotalmeds ntotalimps {
	replace `tot' = . if `tot'==0
}

summarize ntotalmeds ntotalimps, detail

* Log-transform the two imputed totals and nfoodtot. The resulting variables
* (lgntotalmeds, lgntotalimps, lgnfoodtot) are only needed by the optional
* graphs below, which are currently commented out.
foreach v in ntotalmeds ntotalimps nfoodtot {
	generate lg`v' = log(`v')
}

* NOTE(review): in the twoway legend below, label 2 ("Expenditure Imputed") is attached to lgntotalmeds and
* label 3 ("Expenditure Median") to lgntotalimps - these look swapped; confirm before re-enabling the graph.
*scatter lgntotalmeds lgntotalimps 
*twoway (kdensity lgnfoodtot ) (kdensity lgntotalmeds) (kdensity lgntotalimps), legend(order(1 "Expenditure Raw" 2 "Expenditure Imputed" 3 "Expenditure Median"))

* Drop the log variables again; they are not needed beyond this point
drop lgntotalmeds lgntotalimps lgnfoodtot

*------------------------------------------------------------------------------------------------------------------
* Save the dataset, including all imputation variables created above, as
* tempdata7.dta in the folder named by the global $DataOUT, overwriting any
* existing copy. A forward slash is used as the path separator because Stata
* accepts it on every platform, whereas a backslash only works on Windows.
save "$DataOUT/tempdata7.dta", replace
